##### ####################################################
#####                                               ######
#####                 Influence prep             
#####                                               ######
##### ####################################################

# Remove every object from the current environment so the script starts from
# an empty workspace (attached packages are not affected by rm()), then fix
# the random seed so later random draws are reproducible.
rm(list= ls())
set.seed(221186)

## Libraries

library(quanteda) # 1.3.4
library(data.table) # 1.11.4
library(igraph) # 1.2.2
library(stargazer) # 5.2.2

## Data

# Load the saved image; the code below expects it to provide the data.table `speeches`
load("data/speeches.Rdata")

### ################################################
### Add influence scores per speech, per debate
### ################################################

# Position and debate identifiers are stored as numeric so the ordering below is numeric
speeches$hpos          <- as.numeric(speeches$hpos)
speeches$section_id    <- as.numeric(speeches$section_id)
speeches$subsection_id <- as.numeric(speeches$subsection_id)

# Sort (and key) the table by date, debate, sub-debate, and position in debate.
# The corpus and dfm built next inherit this row order.
setkey(speeches, hdate, section_id, subsection_id, hpos)

# quanteda corpus: speech text as documents, every other column as a docvar
speech_corpus <- corpus(
  speeches$body,
  docvars = speeches[, -grep("body", names(speeches)), with = FALSE]
)

# Document-feature matrix: tokenise without punctuation and numbers, trim by
# sparsity, then apply tf-idf weighting
speech_dfm <- dfm_tfidf(
  dfm_trim(
    dfm(speech_corpus, remove_punct = TRUE, remove_numbers = TRUE),
    sparsity = 0.998
  )
)

# Result columns, created up front as all-NA vectors of the type they will hold.

# Network scores within a debate
speeches$eigen     <- rep(NA_real_, nrow(speeches))
speeches$page_rank <- rep(NA_real_, nrow(speeches))
speeches$hub       <- rep(NA_real_, nrow(speeches))
speeches$auth      <- rep(NA_real_, nrow(speeches))
speeches$degree    <- rep(NA_real_, nrow(speeches))

# Attributes of the speech that follows in the same debate
speeches$next.speech.sim      <- rep(NA_real_, nrow(speeches)) # similarity with the next speech
speeches$next.speech.words    <- rep(NA_real_, nrow(speeches))
speeches$next.speech.minister <- rep(NA, nrow(speeches))
speeches$next.speech.gender   <- rep(NA_character_, nrow(speeches))
speeches$next.speech.party    <- rep(NA_character_, nrow(speeches))

# Per-debate influence scores and next-speech attributes
#
# For every calendar year, and every debate (subsection_id) within that year:
#   1. compute the cosine similarity between all speeches in the debate,
#   2. build similarity graphs and derive centrality scores per speech,
#   3. record similarity with, and attributes of, the immediately following speech,
#   4. write the results back into the rows of `speeches`.

threshold <- 0.25 # Similarities below this value are treated as "no link"

# Columns filled per debate; the order must match the value list passed to set() below
score_cols <- c("eigen", "page_rank", "hub", "auth", "degree",
                "next.speech.sim", "next.speech.words", "next.speech.minister",
                "next.speech.gender", "next.speech.party")

## Loop over calendar years

unique_years <- unique(speeches$calyear)
n_years <- length(unique_years)

for (g in seq_len(n_years)) {
  start <- proc.time()

  # Rows of the table, corpus and dfm for this year. All three share one row
  # order, so the same positional index addresses the same speech in each.
  # NOTE(review): `$documents` looks like the quanteda 1.x corpus layout;
  # confirm before upgrading quanteda.
  speeches_year <- speeches[calyear == unique_years[g]]
  speech_corpus_year <- corpus_subset(speech_corpus, calyear == unique_years[g])
  speech_dfm_year <- speech_dfm[which(speech_corpus$documents$calyear == unique_years[g]), ]

  unique_section_ids_year <- unique(speech_corpus_year$documents$subsection_id)
  n_debates_year <- length(unique_section_ids_year)

  ## Loop over debates within the year
  for (debate in seq_len(n_debates_year)) {
    cat(".")

    # Progress reporting
    if ((debate %% 50) == 0) print(round(debate / n_debates_year, 3))
    if (debate %% 500 == 0) {
      print(paste0(round((((proc.time() - start)[3]/debate)*(n_debates_year-debate))/60,3)," minutes remaining for this government (approx)."))
      print(paste0("Year ", g, " of ", n_years))
    }

    speeches_in_debate <- which(speech_corpus_year$documents$subsection_id == unique_section_ids_year[debate])

    # A debate needs at least two speeches for any pairwise similarity
    # (this also skips ids that match no rows, e.g. NA)
    if (length(speeches_in_debate) < 2) next

    cosinemat <- quanteda::textstat_simil(speech_dfm_year[speeches_in_debate, ], margin = "documents", method = "cosine")

    # Keep an untouched copy for the next-speech similarity further down
    cosinemat_raw <- cosinemat <- as.matrix(cosinemat)

    # Undefined similarities count as no similarity (is.na() is also TRUE for NaN)
    cosinemat[is.na(cosinemat)] <- 0

    # Impose threshold
    cosinemat[cosinemat < threshold] <- 0
    undirected.cosine <- cosinemat

    # Keep only entries with row index > column index, so each remaining
    # entry points from a later speech (row) to an earlier one (column)
    cosinemat[upper.tri(cosinemat)] <- 0
    directed.cosine <- cosinemat

    # Influence scores ------------------------------------------------------------

    # Create the graphs - speeches are nodes, edges are the thresholded cosine
    # similarities, which also serve as edge weights.
    # The undirected graph is built from the symmetric matrix: igraph versions
    # that validate the input reject a non-symmetric matrix for
    # mode = "undirected", and versions that take max(A[i, j], A[j, i]) give
    # the same graph either way.
    mygraph.directed <- graph_from_adjacency_matrix(directed.cosine, diag = FALSE, mode = "directed", weighted = TRUE)
    mygraph.undirected <- graph_from_adjacency_matrix(undirected.cosine, diag = FALSE, mode = "undirected", weighted = TRUE)

    # Calculate the page rank score
    page.rank.vec <- page_rank(mygraph.directed, directed = TRUE)$vector

    # Calculate the eigenvector centrality scores
    eigen.vec <- eigen_centrality(mygraph.undirected, directed = FALSE, options = list(maxiter = 10000))$vector

    # Calculate the hub and authority scores
    hub.vec <- hub_score(mygraph.directed)$vector
    auth.vec <- authority_score(mygraph.directed)$vector

    # Calculate degree scores
    # NOTE(review): degree() is called with its default mode; an earlier comment
    # described this as "out degree" - confirm which one is intended.
    degree.vec <- degree(mygraph.directed)

    # Next speech similarity ------------------------------------------------------------

    # Entries one below the diagonal are the similarity of speech i with speech
    # i + 1; the last speech in a debate has no successor, hence the trailing NA
    delta <- row(cosinemat_raw) - col(cosinemat_raw)
    next.speech.vec <- c(cosinemat_raw[delta == 1], NA)

    # Attributes of the following speech, shifted up by one position
    next.speech <- speeches_year[speeches_in_debate[-1], c("word_count", "minister_in_debate", "Gender", "party_short")]

    next.speech.words <- c(next.speech$word_count, NA)
    next.speech.minister <- c(next.speech$minister_in_debate, NA)
    next.speech.gender <- c(as.character(next.speech$Gender), NA)
    next.speech.party <- c(as.character(next.speech$party_short), NA)

    # Assign the scores back to the speeches
    if (length(page.rank.vec) != length(speeches_in_debate) ||
        length(next.speech.vec) != length(speeches_in_debate)) {
      stop("lengths do not match")
    }

    # set() updates the listed rows/columns in place instead of copying the
    # whole year table for every debate
    set(speeches_year, i = speeches_in_debate, j = score_cols,
        value = list(eigen.vec, page.rank.vec, hub.vec, auth.vec, degree.vec,
                     next.speech.vec, next.speech.words, next.speech.minister,
                     next.speech.gender, next.speech.party))
  } # End loop over debates

  # Copy this year's completed rows back into the full table
  speeches[calyear == unique_years[g], ] <- speeches_year

}

# Next-speaker gender as a factor with levels "M" then "F"
# (any value outside these two levels becomes NA)
speeches[, next.speech.gender := factor(next.speech.gender, levels = c("M", "F"))]

# Re-base hpos so that, within each sub-debate, the smallest position is 1
speeches[, hpos.new := hpos - (min(hpos) - 1), by = subsection_id]

# Responsiveness measure: similarity with the next speech times that speech's word count
speeches[, sim.words := next.speech.sim * next.speech.words]

## Is the subsequent speech made by a member of the same party?
speeches[, next.speech.same.party := party_short == next.speech.party]

save(speeches, file = "working/speeches_influence.Rdata")

### ##########################################
### Compare adjacent speeches (within a debate) to non-adjacent speeches
### ##########################################

# Reset the seed so the random speech pairs drawn below are reproducible
# regardless of how many random numbers were consumed earlier in the script
set.seed(221186)

#' Draw one random pair of adjacent elements
#'
#' A position is sampled rather than a value, so duplicated elements cannot
#' redirect the pair to the first occurrence of the same text.
#'
#' @param body Vector of elements in their original order (at least two).
#' @return Length-2 vector `c(a, b)` where `b` directly follows `a` in `body`.
sample_adj <- function(body) {
  n <- length(body)
  if (n < 2L) {
    stop("`body` must contain at least two elements to form an adjacent pair",
         call. = FALSE)
  }

  # The last element has no successor, so it cannot be the first of the pair
  first_pos <- sample.int(n - 1L, 1L)

  body[c(first_pos, first_pos + 1L)]
}

#' Draw one random pair of non-adjacent elements
#'
#' Both elements are chosen by position. The second is drawn from all
#' positions other than the first one and its immediate neighbours.
#'
#' @param body Vector of elements in their original order (at least three).
#' @return Length-2 vector `c(a, b)` whose positions in `body` differ by more
#'   than one.
sample_non <- function(body) {
  n <- length(body)
  if (n < 3L) {
    stop("`body` must contain at least three elements to form a non-adjacent pair",
         call. = FALSE)
  }

  apos <- sample.int(n, 1L)

  # Everything except the chosen position and its two neighbours
  candidates <- setdiff(seq_len(n), c(apos - 1L, apos, apos + 1L))
  if (length(candidates) == 0L) {
    stop("no non-adjacent element available for the sampled position",
         call. = FALSE)
  }

  # Index into the candidates explicitly: sample(x, 1) would sample from 1:x
  # when a single integer candidate is left
  bpos <- candidates[sample.int(length(candidates), 1L)]

  body[c(apos, bpos)]
}

# Compare similarity of adjacent vs non-adjacent speech pairs.
# For every debate with more than 20 speeches, draw one adjacent pair and one
# non-adjacent pair of speeches (two rows each).
speeches[, n_speeches_in_debate := .N, by = subsection_id]
adj <- speeches[n_speeches_in_debate > 20, list(body = sample_adj(body)), by = subsection_id]
non <- speeches[n_speeches_in_debate > 20, list(body = sample_non(body)), by = subsection_id]

adj$adjacent <- TRUE
non$adjacent <- FALSE

# Adjacent pairs first, then non-adjacent pairs; within a debate the rows are
# therefore ordered adjacent a, adjacent b, non-adjacent a, non-adjacent b
adj_non <- rbind(adj, non)
adj_non$n_words <- ntoken(adj_non$body)

# Create quanteda corpus

adj_non_corpus <- corpus(adj_non$body, docvars = adj_non[, -grep("body", names(adj_non)), with = FALSE])

adj_non_dfm <- dfm(adj_non_corpus, remove_punct = TRUE, remove_numbers = TRUE)

adj_non_dfm <- dfm_trim(adj_non_dfm, sparsity = 0.998)

adj_non_dfm <- dfm_tfidf(adj_non_dfm)

# Cosine similarity within each sampled pair, collected in a plain vector and
# attached to the table once at the end
pair_sim <- numeric(nrow(adj_non))
sampled_debates <- unique(adj_non$subsection_id)

for (d in seq_along(sampled_debates)) {

  if (d %% 20 == 0) print(d / length(sampled_debates))

  this_debate <- which(adj_non$subsection_id == sampled_debates[d])

  # Two rows for the adjacent pair followed by two for the non-adjacent pair
  if (length(this_debate) != 4L) {
    stop("expected exactly four sampled speeches per debate")
  }

  this_debate_sim <- as.matrix(quanteda::textstat_simil(adj_non_dfm[this_debate, ], margin = "documents", method = "cosine"))

  pair_sim[this_debate[1:2]] <- this_debate_sim[1, 2]
  pair_sim[this_debate[3:4]] <- this_debate_sim[3, 4]
}

adj_non$sim <- pair_sim

# One row per debate and pair type: the pair's similarity and the word count
# of the second speech in the pair
test <- adj_non[, list(sim = unique(sim), n_words = n_words[2]), by = list(subsection_id, adjacent)]

# Same responsiveness definition as sim.words: similarity x word count
test$res <- test$sim * test$n_words

adj.mod1 <- lm(res ~ adjacent, data = test)

# Run stargazer() and print only part of what it prints: the lines from the
# one matching "begin{tabular}" up to the line before the one matching
# "textit{Note:}". All arguments are passed on to stargazer() unchanged.
mod_stargazer <- function(...) {
  tex_lines <- capture.output(stargazer(...))

  first_kept <- grep("begin\\{tabular\\}", tex_lines)
  last_kept <- grep("textit\\{Note:\\}", tex_lines) - 1

  kept <- tex_lines[first_kept:last_kept]

  # Emit the retained lines as one newline-separated string
  cat(paste(kept, collapse = "\n"), "\n")
}

# Write the trimmed regression table to a .tex file by diverting console
# output. The diversion is closed in `finally`, so an error while building the
# table does not leave output redirected to the file.
sink("latex/tables/adjacent_responsiveness.tex")
tryCatch(
  mod_stargazer(
    adj.mod1,
    covariate.labels = "Adjacent",
    omit = c("section"),
    label = "tab:adjacent_response",
    dep.var.labels = "\\emph{res}",
    dep.var.caption = "",
    title = "Adjacent speeches are more responsive than non-adjacent speeches",
    keep.stat = c("n", "rsq"),
    no.space = TRUE
  ),
  finally = sink()
)
